*********************************************************************************
*** Preparing a regression data file 
*********************************************************************************

#delimit ;  set type double, permanentely ; 
clear ;  clear matrix ; clear mata ; 
set matsize 5000 ;  set maxvar 5000 ;   set more off;   set rmsg on ;  pause on;  

tempfile f1 census county ; 

*** set path here ***

*** Prepare a working file ; 
*** Tap water penetration and other control variables from 2010 census data ; 

use $path\census_county_2010.dta, clear ;   // from census
drop m5- f85 ;  
ren county_code census2010_code ; 
* gen code=county_code ; 
save `census' ; 


use $rawdata\county_2010.dta, clear ;    // county information (poverty) collected by myself 
keep county_code 国内生产总值 mountains hills minorityauto ; 
ren 国内生产总值 gdp ;  
ren county_code code ;  
drop if code==. ;   
save `county' ;  


use $pathdata\IMR_water_source_match.dta, clear ;      // water sources 
merge 1:1 census2010_code using `census' ; 
list code jcdmc if _merge==1 ;   
drop if _merge==2 ; 
drop _merge ; 
merge 1:1 code using `county' ; 
list code jcdmc if _merge==1 ;   
drop if _merge==2 ; 
drop _merge ; 

unique code ; 


merge 1:m code  using $pathdata\IMR_07_15.dta ; 
tab _merge ; 
list code jcdmc year if _merge~=3 ;  
drop _merge ;  
merge 1:1 code year using $pathdata\IMR_water_pollution_match.dta ;     // water pollution 
drop _merge ;  
merge 1:1 code year using $pathdata\IMR_airpollution_match.dta ;     // air pollution 
drop _merge ;  
merge m:1 code using $pathdata\mig_pattern.dta ;   // From 2005 census    
drop _merge ;  
unique code ;  
save `f1' ;  


*** Add controls from for IMR sites in 2010 ; 

import excel "$path\annual controls imr county 2010.xlsx", sheet("Sheet1") firstrow clear ;   
ren 地区ID code ; 
recode code (419999=419001) ;  

recode Mountain (.=0) ; 

drop 年份  ; 
ren 地区名称             Name ; 
ren 监测地区编码         site_code ; 
ren 总人口数             population ; 
ren 人均收入             pcincome ; 
ren 财政支出总额         fisexp ; 
ren 财政支出卫生~费      fisexp_h ;    
ren 财政支出妇幼~费      fisexp_inf ; 
ren 妇幼保健~构总数      inst_inf ; 
ren 妇幼~构人员总数      staff_inf ; 
ren 妇幼~术人员总数      tech_inf ; 

merge 1:m code using `f1' ;  
unique code if _merge~=3 ;        
list code jcdmc Name _merge if _merge~=3 ;    
drop _merge ;  

mdesc ;              
order code year Name ; 
sort code year ;  
drop if imr==. ;   
gen minority_county =  strpos(jcdmc, "自治") > 0 ;       
label var minority_county  "名字里有自治两个字" ;      
save $pathdata\IMR_workfile.dta, replace ; 



*** GIS map ; 

keep if year==2010 ; 
keep code tapwater sharetapwater longitude0 latitude0 ; 
export excel using $gis\IMR_tapwatercoverage.xlsx, firstrow(variables) replace ; 


tempfile f1 f2 f3 ; 
import excel $path\PKU_cost_distance_calculation.xlsx, sheet("Sheet1") firstrow clear ;         
assert imr_id == iid ;   
count if river_length==0 ;  
count if dem_length==0 ;   

preserve ;  
sort imr_id dem_length ; 
bysort imr_id: keep if _n==1 ;    
ren dem_length cost_dem_river_1 ;   
save `f1' ;   
restore ;  

preserve ;  
drop if type_eng=="underground water" ;   
sort imr_id dem_length ; 
bysort imr_id: keep if _n==1 ;    
ren dem_length cost_dem_river_2 ;   
save `f2' ;   
restore ;  

preserve ;  
sort imr_id river_length ; 
bysort imr_id: keep if _n==1 ;    
ren river_length cost_river_1 ;   
save `f3' ;   
restore ;  

drop if type_eng=="underground water" ;   
sort imr_id river_length ; 
bysort imr_id: keep if _n==1 ;    
ren river_length cost_river_2 ;   

merge 1:1 imr_id using `f1' ;    drop _merge ; 
merge 1:1 imr_id using `f2' ;    drop _merge ; 
merge 1:1 imr_id using `f3' ;    drop _merge ; 

keep code cost* ;   

merge 1:m code using $pathdata\IMR_workfile.dta ;  drop _merge ;  
order cost*, last ;  
save $pathdata\IMR_workfile.dta, replace ;  



